DATA PROCESSING

load libraries

packages = c("readxl","ggplot2","tidyverse","dplyr","lubridate","lessR","MatchIt",
             "cobalt", "eeptools","grf","tidymodels", "fastDummies")

# install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
  install.packages(packages[!installed_packages])
}
# packages loading
invisible(lapply(packages, library, character.only = TRUE))
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## 
## Attaching package: 'lubridate'
## 
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## 
## 
## lessR 4.2.8                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")   Read text, Excel, SPSS, SAS, or R data file
##   d is default data frame, data= in analysis routines optional
## 
## Learn about reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables.
##   Enter:  browseVignettes("lessR")
## 
## View changes in this and recent versions of lessR.
##   Enter: news(package="lessR")
## 
## Interactive data analysis.
##   Enter: interact()
## 
## 
## 
## Attaching package: 'lessR'
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     recode, rename
## 
## 
##  cobalt (Version 4.5.1, Build Date: 2023-04-27)
## 
## 
## Attaching package: 'cobalt'
## 
## 
## The following object is masked from 'package:MatchIt':
## 
##     lalonde
## 
## 
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## 
## ✔ broom        1.0.1     ✔ rsample      1.1.0
## ✔ dials        1.0.0     ✔ tune         1.0.1
## ✔ infer        1.0.3     ✔ workflows    1.1.0
## ✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
## ✔ parsnip      1.0.2     ✔ yardstick    1.1.0
## ✔ recipes      1.0.2     
## 
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ lessR::recode()   masks dplyr::recode()
## ✖ lessR::rename()   masks dplyr::rename()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
## 
## Thank you for using fastDummies!
## 
## To acknowledge our work, please cite the package:
## 
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.

load the data

msg_1 <- read_excel("~/Desktop/Monkee data/Messages_1 with userid.xlsx")
msg_2 <- read_excel("~/Desktop/Monkee data/messages_2_with userid.xlsx")
goals <- read_excel("~/Desktop/Monkee data/Monkee_goals.xlsx")
users <- read_excel("~/Desktop/Monkee data/Monkee_users.xlsx")

messages dataset clean up

# merge the message datasets
messages <- rbind(msg_1, msg_2) 

# count NAs and decide NAs in which column need to be removed 
colSums(is.na(messages))
##                user_id           message_uuid           message_type 
##                      0                      1                      1 
##         message_status                send_at             reacted_at 
##                      1                   1376                   1376 
##          message_title        message_message         message_action 
##                   1376                   1376                   1376 
##             message_id message_button_label_1 message_button_label_2 
##                   2751                   2751                   2755 
## message_button_label_3         button_pressed 
##                   2757                 453073
msg_subset <- messages[, c(2:13)]
clean_rows <- complete.cases(msg_subset)
messages_clean <- messages[clean_rows,]
colSums(is.na(messages_clean))
##                user_id           message_uuid           message_type 
##                      0                      0                      0 
##         message_status                send_at             reacted_at 
##                      0                      0                      0 
##          message_title        message_message         message_action 
##                      0                      0                      0 
##             message_id message_button_label_1 message_button_label_2 
##                      0                      0                      0 
## message_button_label_3         button_pressed 
##                      0                 450322
sapply(messages_clean, function(x) n_distinct(x)) #5,008 users
##                user_id           message_uuid           message_type 
##                   5008                 641665                      1 
##         message_status                send_at             reacted_at 
##                      2                 239374                 565468 
##          message_title        message_message         message_action 
##                      6                  43078                   2598 
##             message_id message_button_label_1 message_button_label_2 
##                     18                    757                      4 
## message_button_label_3         button_pressed 
##                      3                    167

goals dataset cleanup

colSums(is.na(goals))
##          user_id          goal_id    starting_date         end_date 
##                0                1                1                1 
##           status      archived_at    target_amount      week_target 
##                1            16186                1                1 
## starting_capital         priority   current_amount         category 
##                1                1                1             7517 
##     sub_category      saving_mode   goal_on_target 
##            15277                1                1
goals <- goals[!is.na(goals$goal_id),]

table(goals$priority) # need to remove rows with a value of u=3, i (15 obs)
## 
##   HIGH NORMAL u=3, i 
##   7476  19465     15
goals_clean <- goals[!goals$priority == 'u=3, i',] 

colSums(is.na(goals_clean))
##          user_id          goal_id    starting_date         end_date 
##                0                0                0                0 
##           status      archived_at    target_amount      week_target 
##                0            16176                0                0 
## starting_capital         priority   current_amount         category 
##                0                0                0             7508 
##     sub_category      saving_mode   goal_on_target 
##            15266                0                0
sapply(goals_clean, function(x) n_distinct(x)) #11,779 users #26,941 goals #14 categories
##          user_id          goal_id    starting_date         end_date 
##            11779            26941            26940             3306 
##           status      archived_at    target_amount      week_target 
##                3            10764             1462             4981 
## starting_capital         priority   current_amount         category 
##             3090                2            10125               14 
##     sub_category      saving_mode   goal_on_target 
##               85                2                2

users dataset cleanup

colSums(is.na(users))
##                   user_id             date_of_birth                    gender 
##                         0                         1                         0 
##                 joined_at                   country           treatment_group 
##                         0                         0                     18094 
##          operating_system           number_of_goals    number_of_goals_active 
##                      8645                      6314                      6314 
##  number_of_goals_achieved number_of_goals_on_target 
##                      6314                      6314
users_clean <- users[!is.na(users$number_of_goals),] 
colSums(is.na(users_clean))
##                   user_id             date_of_birth                    gender 
##                         0                         0                         0 
##                 joined_at                   country           treatment_group 
##                         0                         0                     11780 
##          operating_system           number_of_goals    number_of_goals_active 
##                      4782                         0                         0 
##  number_of_goals_achieved number_of_goals_on_target 
##                         0                         0
sapply(users_clean, function(x) n_distinct(x)) #11,780 users 
##                   user_id             date_of_birth                    gender 
##                     11780                      7365                         2 
##                 joined_at                   country           treatment_group 
##                     11780                         2                         1 
##          operating_system           number_of_goals    number_of_goals_active 
##                         5                        54                        30 
##  number_of_goals_achieved number_of_goals_on_target 
##                        15                        28

merge the cleaned data sets, by user_id

joint <- list(messages_clean, goals_clean, users_clean) %>% 
  reduce(inner_join, by = 'user_id')  #213,326

sapply(joint, function(x) n_distinct(x)) #4,988 users #641,391 msgs #19,242 goals
##                   user_id              message_uuid              message_type 
##                      4988                    641391                         1 
##            message_status                   send_at                reacted_at 
##                         2                    239239                    565312 
##             message_title           message_message            message_action 
##                         6                     43050                      2598 
##                message_id    message_button_label_1    message_button_label_2 
##                        18                       756                         4 
##    message_button_label_3            button_pressed                   goal_id 
##                         3                       167                     19242 
##             starting_date                  end_date                    status 
##                     19241                      3018                         3 
##               archived_at             target_amount               week_target 
##                     10636                      1208                      4306 
##          starting_capital                  priority            current_amount 
##                      3088                         2                     10073 
##                  category              sub_category               saving_mode 
##                        14                        85                         2 
##            goal_on_target             date_of_birth                    gender 
##                         2                      4002                         2 
##                 joined_at                   country           treatment_group 
##                      4988                         2                         1 
##          operating_system           number_of_goals    number_of_goals_active 
##                         5                        54                        30 
##  number_of_goals_achieved number_of_goals_on_target 
##                        15                        28

multi- vs single-goal users

multi_goal <- joint %>% 
  filter(number_of_goals > 1)

sapply(multi_goal, function(x) n_distinct(x)) #3,215 multi-goal users 
##                   user_id              message_uuid              message_type 
##                      3215                    464448                         1 
##            message_status                   send_at                reacted_at 
##                         2                    169836                    403461 
##             message_title           message_message            message_action 
##                         6                     35954                      1818 
##                message_id    message_button_label_1    message_button_label_2 
##                        18                       629                         4 
##    message_button_label_3            button_pressed                   goal_id 
##                         3                       144                     17469 
##             starting_date                  end_date                    status 
##                     17468                      2951                         3 
##               archived_at             target_amount               week_target 
##                     10029                      1143                      4087 
##          starting_capital                  priority            current_amount 
##                      2959                         2                      9513 
##                  category              sub_category               saving_mode 
##                        14                        85                         2 
##            goal_on_target             date_of_birth                    gender 
##                         2                      2790                         2 
##                 joined_at                   country           treatment_group 
##                      3215                         2                         1 
##          operating_system           number_of_goals    number_of_goals_active 
##                         5                        53                        30 
##  number_of_goals_achieved number_of_goals_on_target 
##                        15                        28

seperate and focus on single-goal users

single_goal <- joint %>% 
  filter(number_of_goals == 1)

sapply(single_goal, function(x) n_distinct(x)) #1,773 single-goal users 
##                   user_id              message_uuid              message_type 
##                      1773                    176943                         1 
##            message_status                   send_at                reacted_at 
##                         2                     69836                    165247 
##             message_title           message_message            message_action 
##                         6                     11544                      1471 
##                message_id    message_button_label_1    message_button_label_2 
##                        16                       466                         4 
##    message_button_label_3            button_pressed                   goal_id 
##                         3                        74                      1773 
##             starting_date                  end_date                    status 
##                      1773                       799                         3 
##               archived_at             target_amount               week_target 
##                       608                       258                       911 
##          starting_capital                  priority            current_amount 
##                       202                         2                      1005 
##                  category              sub_category               saving_mode 
##                        14                        65                         2 
##            goal_on_target             date_of_birth                    gender 
##                         2                      1633                         2 
##                 joined_at                   country           treatment_group 
##                      1773                         2                         1 
##          operating_system           number_of_goals    number_of_goals_active 
##                         5                         1                         2 
##  number_of_goals_achieved number_of_goals_on_target 
##                         2                         2
table(single_goal$message_id) #message_id of interest: 2,3,4; 70,71,72; 73,74,75
## 
##     1     2     3     4     5     6     7    58    62    70    71    72    73 
## 20310  1082  4816  3558 11445  9957 10325 98601 12889    22   160   133  3316 
##    74    75    78 
##   239    89     1

filter relevant messages & msg outside the experiment period

sg_filtered <- single_goal[single_goal$message_id %in% 
                                c('2', '3', '4',
                                  '70', '71', '72', 
                                  '73', '74', '75'), ] %>% 
  filter(send_at >= as.POSIXct('2021-05-01 0:00:00'))

sapply(sg_filtered, function(x) n_distinct(x)) #1,112 users #9,583 msg 
##                   user_id              message_uuid              message_type 
##                      1112                      9583                         1 
##            message_status                   send_at                reacted_at 
##                         2                      6911                      9550 
##             message_title           message_message            message_action 
##                         2                      1677                      1399 
##                message_id    message_button_label_1    message_button_label_2 
##                         9                       419                         1 
##    message_button_label_3            button_pressed                   goal_id 
##                         2                        69                      1112 
##             starting_date                  end_date                    status 
##                      1112                       550                         3 
##               archived_at             target_amount               week_target 
##                       285                       183                       632 
##          starting_capital                  priority            current_amount 
##                       146                         2                       788 
##                  category              sub_category               saving_mode 
##                        14                        62                         2 
##            goal_on_target             date_of_birth                    gender 
##                         2                      1050                         2 
##                 joined_at                   country           treatment_group 
##                      1112                         2                         1 
##          operating_system           number_of_goals    number_of_goals_active 
##                         5                         1                         2 
##  number_of_goals_achieved number_of_goals_on_target 
##                         2                         2

VARIABLES

DV1: viewed <- message_status

sg_filtered$viewed <- as.numeric(sg_filtered$message_status == 'VIEWED')

DV2: saving_response <- button_pressed

sg_filtered$saving_response <- ifelse(
  sg_filtered$button_pressed == 'Später' | 
    sg_filtered$button_pressed == 'Nein, danke',
  0,
  1)

treat

# control 
sg_filtered$benchmark = sg_filtered$message_id == 2 |
  sg_filtered$message_id == 3 | sg_filtered$message_id == 4 
table(sg_filtered$benchmark) #5624
## 
## FALSE  TRUE 
##  3959  5624
# absolute 
sg_filtered$absolute = sg_filtered$message_id == 70 | 
  sg_filtered$message_id == 71 | sg_filtered$message_id == 72
table(sg_filtered$absolute) #315
## 
## FALSE  TRUE 
##  9268   315
# percentage 
sg_filtered$percentage = sg_filtered$message_id == 73 | 
  sg_filtered$message_id == 74 | sg_filtered$message_id == 75
table(sg_filtered$percentage) #3644
## 
## FALSE  TRUE 
##  5939  3644
sg_filtered$treat = sg_filtered$benchmark
sg_filtered$treat[sg_filtered$benchmark] = "control"
sg_filtered$treat[sg_filtered$absolute] = "absolute"
sg_filtered$treat[sg_filtered$percentage] = "percentage"

sg_filtered$treat = as.factor(sg_filtered$treat)
sg_filtered$treat = relevel(sg_filtered$treat, 
                                       ref = "control")

additional covariates

goals: time length

# goal_length
sg_filtered$goal_length <- 
  lubridate::time_length(
    difftime(as.Date(sg_filtered$end_date),
             as.Date(sg_filtered$starting_date)),
    "years")

users: age & tenure

# pick the end point as on the day after the experiment ended
# age
sg_filtered$age <- floor(age_calc(as.Date(sg_filtered$date_of_birth),
                                  as.Date('2021-10-05'),
                                  units = "years"))

# tenure
sg_filtered$tenure <- lubridate::time_length(difftime(as.Date('2021-10-05'),
                                                            as.Date(sg_filtered$joined_at)),
                                                   "years")

now that we have all the variables in hand, the next step is to pick important (pre-treatment) covariates for the analysis:

drop “goal_on_target” and “current_amount” as they are measured at the end of the experiment

# goal-related: goal_length, week_target, starting_capital, priority, category, saving_mode,
# user-related: gender, country, age, tenure

# convert categorical vars into correct data types 

cat.var <- c('priority',
             'category',
             'saving_mode',
             'gender',
             'country')

cont.var <- c('goal_length',
              'week_target',
              'starting_capital',
              'age',
              'tenure')

sg_filtered[, cat.var] <- lapply(sg_filtered[, cat.var], factor, exclude = NULL)

# the cateogry variable contains NA values, however it should be treated as an unknown category instead 
levels(sg_filtered$category)[match(NA, levels(sg_filtered$category))] <- "UNKNOWN"

# final df
final_df <- sg_filtered %>% dplyr::select('viewed', 
                                          'saving_response',
                                          'treat', 
                                          'message_id',
                                          cat.var, 
                                          cont.var) #9583

# make dummies for cat var
final_df = dummy_cols(final_df, select_columns = c("priority", "category", "saving_mode",
                                                   "gender", "country"))
final_df = final_df[, -c(5:9)]

# convert NAs in saving_response into 0
final_df["saving_response"][is.na(final_df["saving_response"])] <- 0

# additional df for saving_response (conditional on viewing): final_df_cond
final_df_cond <- final_df[final_df$viewed == 1,]

subset df for treatment pairs based on final_df

# control vs absolute
ca <- subset(final_df, treat %in% c('control','absolute')) #5939
ca$treat <- ifelse(ca$treat == "control", 0, 1) 

# control vs percentage
cp <- subset(final_df, treat %in% c('control','percentage')) #9268
cp$treat <- ifelse(cp$treat == "control", 0, 1)

# absolute vs percentage 
ap <- subset(final_df, treat %in% c('absolute','percentage')) #3959
ap$treat <- ifelse(ap$treat == "absolute", 0, 1)

subset df for treatment pairs based on final_df_cond

# control vs absolute
ca_cond <- subset(final_df_cond, treat %in% c('control','absolute')) #1736
ca_cond$treat <- ifelse(ca_cond$treat == "control", 0, 1) 

# control vs percentage
cp_cond <- subset(final_df_cond, treat %in% c('control','percentage')) #2471
cp_cond$treat <- ifelse(cp_cond$treat == "control", 0, 1)

# absolute vs percentage 
ap_cond <- subset(final_df_cond, treat %in% c('absolute','percentage')) #971
ap_cond$treat <- ifelse(ap_cond$treat == "absolute", 0, 1)

RANDOMIZATION CHECK

for treat df

  • control vs absolute
W.ca = ca$treat
X.ca = ca[,c(5:31)]
m.ca = matchit(
  as.formula(paste("W.ca ~ ", paste(colnames(X.ca), collapse = " + "))),
  data = ca,
  method = "nearest",
  distance = "glm")

plot(summary(m.ca))

bal.tab(m.ca)
## Balance Measures
##                                    Type Diff.Adj
## distance                       Distance   0.2078
## goal_length                     Contin.  -0.0063
## week_target                     Contin.   0.0531
## starting_capital                Contin.   0.0142
## age                             Contin.  -0.1279
## tenure                          Contin.   0.0953
## priority_HIGH                    Binary  -0.0571
## category_EDUCATION               Binary   0.0000
## category_FINANCIAL_FREEDOM       Binary  -0.0095
## category_HEALTH                  Binary   0.0000
## category_HOME_ENTERTAINMENT      Binary   0.0444
## category_HOME_LIVING             Binary  -0.0254
## category_HOUSEHOLD_APPLIANCES    Binary   0.0000
## category_LEISURE_ENTERTAINMENT   Binary   0.0000
## category_LIFE_EVENTS_PRESENTS    Binary   0.0000
## category_LIFESTYLE               Binary   0.0000
## category_MOBILITY                Binary   0.0127
## category_RECURRING_COSTS         Binary  -0.0349
## category_SPORTS_EQUIPMENT        Binary   0.0000
## category_TRAVEL_HOLIDAY          Binary  -0.0317
## category_UNKNOWN                 Binary   0.0444
## saving_mode_AUTOPILOT            Binary  -0.0571
## gender_FEMALE                    Binary  -0.0286
## country_AT                       Binary   0.0413
## 
## Sample sizes
##           Control Treated
## All          5624     315
## Matched       315     315
## Unmatched    5309       0
  • control vs percentage
W.cp = cp$treat
X.cp = cp[,c(5:31)]
m.cp = matchit(
  as.formula(paste("W.cp ~ ", paste(colnames(X.cp), collapse = " + "))),
  data = cp,
  method = "full",
  distance = "glm")

plot(summary(m.cp))

bal.tab(m.cp)
## Balance Measures
##                                    Type Diff.Adj
## distance                       Distance   0.0020
## goal_length                     Contin.   0.0073
## week_target                     Contin.  -0.0780
## starting_capital                Contin.  -0.0763
## age                             Contin.   0.0012
## tenure                          Contin.   0.0061
## priority_HIGH                    Binary  -0.1075
## category_EDUCATION               Binary  -0.0072
## category_FINANCIAL_FREEDOM       Binary   0.0202
## category_HEALTH                  Binary  -0.0003
## category_HOME_ENTERTAINMENT      Binary  -0.0130
## category_HOME_LIVING             Binary  -0.0682
## category_HOUSEHOLD_APPLIANCES    Binary   0.0005
## category_LEISURE_ENTERTAINMENT   Binary   0.0053
## category_LIFE_EVENTS_PRESENTS    Binary   0.0062
## category_LIFESTYLE               Binary  -0.0030
## category_MOBILITY                Binary  -0.0203
## category_RECURRING_COSTS         Binary   0.0001
## category_SPORTS_EQUIPMENT        Binary   0.0092
## category_TRAVEL_HOLIDAY          Binary   0.0642
## category_UNKNOWN                 Binary   0.0064
## saving_mode_AUTOPILOT            Binary  -0.0582
## gender_FEMALE                    Binary  -0.0231
## country_AT                       Binary   0.0368
## 
## Sample sizes
##                      Control Treated
## All                  5624.      3644
## Matched (ESS)         186.53    3644
## Matched (Unweighted) 5624.      3644
  • absolute vs percentage
W.ap = ap$treat
X.ap = ap[,c(5:31)]
m.ap = matchit(
  as.formula(paste("W.ap ~ ", paste(colnames(X.ap), collapse = " + "))),
  data = ap,
  method = "full",
  distance = "glm")

plot(summary(m.ap))

bal.tab(m.ap)
## Balance Measures
##                                    Type Diff.Adj
## distance                       Distance   0.0009
## goal_length                     Contin.  -0.1134
## week_target                     Contin.   0.2164
## starting_capital                Contin.  -0.0218
## age                             Contin.  -0.0496
## tenure                          Contin.   0.4342
## priority_HIGH                    Binary   0.0172
## category_EDUCATION               Binary   0.0296
## category_FINANCIAL_FREEDOM       Binary  -0.0243
## category_HEALTH                 Contin.   0.0000
## category_HOME_ENTERTAINMENT      Binary   0.0147
## category_HOME_LIVING             Binary   0.0081
## category_HOUSEHOLD_APPLIANCES    Binary   0.0063
## category_LEISURE_ENTERTAINMENT   Binary   0.0074
## category_LIFE_EVENTS_PRESENTS    Binary  -0.0307
## category_LIFESTYLE               Binary  -0.0052
## category_MOBILITY                Binary   0.0056
## category_RECURRING_COSTS         Binary   0.0019
## category_SPORTS_EQUIPMENT        Binary  -0.0071
## category_TRAVEL_HOLIDAY          Binary  -0.0140
## category_UNKNOWN                 Binary   0.0077
## saving_mode_AUTOPILOT            Binary  -0.1313
## gender_FEMALE                    Binary  -0.0245
## country_AT                       Binary   0.0182
## 
## Sample sizes
##                      Control Treated
## All                   315.      3644
## Matched (ESS)          44.43    3644
## Matched (Unweighted)  315.      3644

for conditional treat df

W.ca.cond = ca_cond$treat
X.ca.cond = ca_cond[,c(5:31)]
m.ca.cond = matchit(
  as.formula(paste("W.ca.cond ~ ", paste(colnames(X.ca.cond), collapse = " + "))),
  data = ca_cond,
  method = "full",
  distance = "glm")

plot(summary(m.ca.cond))

bal.tab(m.ca.cond)
## Balance Measures
##                                    Type Diff.Adj
## distance                       Distance   0.0105
## goal_length                     Contin.  -0.0283
## week_target                     Contin.   0.0757
## starting_capital                Contin.  -0.0160
## age                             Contin.   0.1861
## tenure                          Contin.  -0.2235
## priority_HIGH                    Binary  -0.0763
## category_EDUCATION               Binary  -0.0009
## category_FINANCIAL_FREEDOM       Binary   0.0039
## category_HEALTH                 Contin.   0.0000
## category_HOME_ENTERTAINMENT      Binary   0.0059
## category_HOME_LIVING             Binary  -0.0001
## category_HOUSEHOLD_APPLIANCES    Binary  -0.0008
## category_LEISURE_ENTERTAINMENT   Binary   0.0022
## category_LIFE_EVENTS_PRESENTS    Binary   0.0039
## category_LIFESTYLE               Binary   0.0010
## category_MOBILITY                Binary  -0.0512
## category_RECURRING_COSTS         Binary   0.0023
## category_SPORTS_EQUIPMENT        Binary  -0.0022
## category_TRAVEL_HOLIDAY          Binary  -0.0047
## category_UNKNOWN                 Binary   0.0408
## saving_mode_AUTOPILOT            Binary   0.0127
## gender_FEMALE                    Binary   0.0239
## country_AT                       Binary   0.0111
## 
## Sample sizes
##                      Control Treated
## All                  1618.       118
## Matched (ESS)         116.73     118
## Matched (Unweighted) 1618.       118
W.cp.cond = cp_cond$treat
X.cp.cond = cp_cond[,c(5:31)]
m.cp.cond = matchit(
  as.formula(paste("W.cp.cond ~ ", paste(colnames(X.cp.cond), collapse = " + "))),
  data = cp_cond,
  method = "full",
  distance = "glm")

plot(summary(m.cp.cond))

bal.tab(m.cp.cond)
## Balance Measures
##                                    Type Diff.Adj
## distance                       Distance  -0.0006
## goal_length                     Contin.   0.0472
## week_target                     Contin.   0.0974
## starting_capital                Contin.   0.0719
## age                             Contin.  -0.0676
## tenure                          Contin.   0.0074
## priority_HIGH                    Binary  -0.0140
## category_EDUCATION               Binary   0.0089
## category_FINANCIAL_FREEDOM       Binary  -0.0019
## category_HEALTH                 Contin.   0.0000
## category_HOME_ENTERTAINMENT      Binary  -0.0005
## category_HOME_LIVING             Binary  -0.0133
## category_HOUSEHOLD_APPLIANCES    Binary  -0.0109
## category_LEISURE_ENTERTAINMENT   Binary   0.0069
## category_LIFE_EVENTS_PRESENTS    Binary  -0.0011
## category_LIFESTYLE               Binary   0.0052
## category_MOBILITY                Binary  -0.0171
## category_RECURRING_COSTS         Binary  -0.0014
## category_SPORTS_EQUIPMENT        Binary   0.0036
## category_TRAVEL_HOLIDAY          Binary  -0.0518
## category_UNKNOWN                 Binary   0.0735
## saving_mode_AUTOPILOT            Binary  -0.0482
## gender_FEMALE                    Binary  -0.0323
## country_AT                       Binary  -0.0133
## 
## Sample sizes
##                      Control Treated
## All                  1618.       853
## Matched (ESS)         143.47     853
## Matched (Unweighted) 1618.       853
W.ap.cond = ap_cond$treat
X.ap.cond = ap_cond[,c(5:31)]
m.ap.cond = matchit(
  as.formula(paste("W.ap.cond ~ ", paste(colnames(X.ap.cond), collapse = " + "))),
  data = ap_cond,
  method = "full",
  distance = "glm")

plot(summary(m.ap.cond))

bal.tab(m.ap.cond)
## Balance Measures
##                                    Type Diff.Adj
## distance                       Distance   0.0476
## goal_length                     Contin.  -0.0533
## week_target                     Contin.   0.1337
## starting_capital                Contin.   0.0790
## age                             Contin.   0.1324
## tenure                          Contin.   0.1167
## priority_HIGH                    Binary  -0.0788
## category_EDUCATION               Binary   0.0176
## category_FINANCIAL_FREEDOM       Binary  -0.0539
## category_HEALTH                 Contin.   0.0000
## category_HOME_ENTERTAINMENT      Binary   0.0434
## category_HOME_LIVING             Binary  -0.0234
## category_HOUSEHOLD_APPLIANCES    Binary   0.0223
## category_LEISURE_ENTERTAINMENT   Binary   0.0094
## category_LIFE_EVENTS_PRESENTS    Binary  -0.0152
## category_LIFESTYLE               Binary   0.0094
## category_MOBILITY                Binary   0.0290
## category_RECURRING_COSTS         Binary   0.0012
## category_SPORTS_EQUIPMENT        Binary   0.0305
## category_TRAVEL_HOLIDAY          Binary  -0.1028
## category_UNKNOWN                 Binary   0.0328
## saving_mode_AUTOPILOT            Binary   0.0853
## gender_FEMALE                    Binary  -0.0625
## country_AT                       Binary   0.0695
## 
## Sample sizes
##                      Control Treated
## All                   118.       853
## Matched (ESS)          22.04     853
## Matched (Unweighted)  118.       853

matched samples

ca_matched <- match.data(m.ca) 

cp_matched <- match.data(m.cp)

ap_matched <- match.data(m.ap)

ca_cond_matched <- match.data(m.ca.cond)

cp_cond_matched <- match.data(m.cp.cond)

ap_cond_matched <- match.data(m.ap.cond)

treatment allocation per matched sample

table(ca_matched$treat)
## 
##   0   1 
## 315 315
table(cp_matched$treat)
## 
##    0    1 
## 5624 3644
table(ap_matched$treat)
## 
##    0    1 
##  315 3644
table(ca_cond_matched$treat)
## 
##    0    1 
## 1618  118
table(cp_cond_matched$treat)
## 
##    0    1 
## 1618  853
table(ap_cond_matched$treat)
## 
##   0   1 
## 118 853

CAUSAL FOREST

# set seed & trees 
myseed = 1234
ntrees = 500

ATE

  • control vs absolute
# test-train split
ca_split <- initial_split(ca_matched, prop = 0.5)
ca_train <- training(ca_split) #315
ca_test <- testing(ca_split) #315 

X.ca.train = ca_train[, c(5:31)]
X.ca.test = ca_test[, c(5:31)]

Y.ca.train.v = ca_train$viewed
Y.ca.train.s = ca_train$saving_response

W.ca.train = ca_train$treat

# m(X) = E[Y | X]
Y.ca.forest.v = regression_forest(X.ca.train, Y.ca.train.v, num.trees = ntrees)
Y.ca.hat.v = predict(Y.ca.forest.v, X.ca.test)$predictions

Y.ca.forest.s = regression_forest(X.ca.train, Y.ca.train.s, num.trees = ntrees)
Y.ca.hat.s = predict(Y.ca.forest.s, X.ca.test)$predictions

# propensity score E[W | X]
W.ca.forest = regression_forest(X.ca.train, W.ca.train, num.trees = ntrees)
W.ca.hat = predict(W.ca.forest, X.ca.test)$predictions

# cf estimation 

cf_ca_v <- causal_forest(
  X.ca.train,
  Y.ca.train.v,
  W.ca.train,
  Y.hat = Y.ca.hat.v,
  W.hat = W.ca.hat,
  tune.parameters = "all",
  seed = myseed
  )

cf_ca_s <- causal_forest(
  X.ca.train,
  Y.ca.train.s,
  W.ca.train,
  Y.hat = Y.ca.hat.s,
  W.hat = W.ca.hat,
  tune.parameters = "all",
  seed = myseed
  )

# out-of-bag predictions
tau_hat_ca_v = predict(cf_ca_v, X.ca.test)$predictions

tau_hat_ca_s = predict(cf_ca_s, X.ca.test)$predictions

# plot tau.hat
plot(density(tau_hat_ca_v),
     main = "Predicted view rate CATEs: control vs. absolute")

plot(density(tau_hat_ca_s),
     main = "Predicted saving rate CATEs: control vs. absolute")

# ATE
ca_ATE_ps_v = average_treatment_effect(cf_ca_v)
paste("95% CI for the view rate ATE:", round(ca_ATE_ps_v[1], 3),
      "+/-", round(qnorm(0.975) * ca_ATE_ps_v[2], 3))
## [1] "95% CI for the view rate ATE: 0.054 +/- 0.131"
ca_ATE_ps_s = average_treatment_effect(cf_ca_s)
paste("95% CI for the saving rate ATE:", round(ca_ATE_ps_s[1], 3),
      "+/-", round(qnorm(0.975) * ca_ATE_ps_s[2], 3))
## [1] "95% CI for the saving rate ATE: -0.06 +/- 0.085"
ca_ATE_ps_v
##   estimate    std.err 
## 0.05418335 0.06703059
ca_ATE_ps_s
##    estimate     std.err 
## -0.06033495  0.04332149

for the conditional df:

# test-train split
ca_cond_split <- initial_split(ca_cond_matched, prop = 0.5)
ca_cond_train <- training(ca_cond_split) #868
ca_cond_test <- testing(ca_cond_split) #868

X.ca.cond.train = ca_cond_train[, c(5:31)]
X.ca.cond.test = ca_cond_test[, c(5:31)]

Y.ca.cond.train.s = ca_cond_train$saving_response

W.ca.cond.train = ca_cond_train$treat

# m(X) = E[Y | X]
Y.ca.cond.forest.s = regression_forest(X.ca.cond.train, Y.ca.cond.train.s, num.trees = ntrees)
Y.ca.cond.hat.s = predict(Y.ca.cond.forest.s, X.ca.cond.test)$predictions

# propensity score E[W | X]
W.ca.cond.forest = regression_forest(X.ca.cond.train, W.ca.cond.train, num.trees = ntrees)
W.ca.cond.hat = predict(W.ca.cond.forest, X.ca.cond.test)$predictions

# cf estimation 
cf_ca_cond_s <- causal_forest(
  X.ca.cond.train,
  Y.ca.cond.train.s,
  W.ca.cond.train,
  Y.hat = Y.ca.cond.hat.s,
  W.hat = W.ca.cond.hat, 
  tune.parameters = "all",
  seed = myseed
  )

# out-of-bag predictions
tau_hat_ca_cond_s = predict(cf_ca_cond_s, X.ca.cond.test)$predictions

# plot tau.hat
plot(density(tau_hat_ca_cond_s),
     main = "Predicted conditional saving rate CATEs: control vs. absolute")

# save rate ATE conditional on viewing
ca_cond_ATE_ps_s = average_treatment_effect(cf_ca_cond_s)
paste("95% CI for the conditional saving rate ATE:", round(ca_cond_ATE_ps_s[1], 3),
      "+/-", round(qnorm(0.975) * ca_cond_ATE_ps_s[2], 3))
## [1] "95% CI for the conditional saving rate ATE: 0.11 +/- 0.154"
ca_cond_ATE_ps_s 
##   estimate    std.err 
## 0.10958988 0.07849574
  • control vs percentage
# test-train split
cp_split <- initial_split(cp_matched, prop = 0.5)
cp_train <- training(cp_split) #4634
cp_test <- testing(cp_split) #4634 

X.cp.train = cp_train[, c(5:31)]
X.cp.test = cp_test[, c(5:31)]

Y.cp.train.v = cp_train$viewed
Y.cp.train.s = cp_train$saving_response

W.cp.train = cp_train$treat

# m(X) = E[Y | X]
Y.cp.forest.v = regression_forest(X.cp.train, Y.cp.train.v, num.trees = ntrees)
Y.cp.hat.v = predict(Y.cp.forest.v, X.cp.test)$predictions

Y.cp.forest.s = regression_forest(X.cp.train, Y.cp.train.s, num.trees = ntrees)
Y.cp.hat.s = predict(Y.cp.forest.s, X.cp.test)$predictions

# propensity score E[W | X]
W.cp.forest = regression_forest(X.cp.train, W.cp.train, num.trees = ntrees)
W.cp.hat = predict(W.cp.forest, X.cp.test)$predictions

# cf estimation 
cf_cp_v <- causal_forest(
  X.cp.train,
  Y.cp.train.v,
  W.cp.train,
  Y.hat = Y.cp.hat.v,
  W.hat = W.cp.hat,
  tune.parameters = "all",
  seed = myseed
  )

cf_cp_s <- causal_forest(
  X.cp.train,
  Y.cp.train.s,
  W.cp.train,
  Y.hat = Y.cp.hat.s,
  W.hat = W.cp.hat,
  tune.parameters = "all",
  seed = myseed
  )

# out-of-bag predictions
tau_hat_cp_v = predict(cf_cp_v, X.cp.test)$predictions

tau_hat_cp_s = predict(cf_cp_s, X.cp.test)$predictions

# plot tau.hat
plot(density(tau_hat_cp_v),
     main = "Predicted view rate CATEs: control vs. percentage")

plot(density(tau_hat_cp_s),
     main = "Predicted saving rate CATEs: control vs. percentage")

# ATE
cp_ATE_ps_v = average_treatment_effect(cf_cp_v)
paste("95% CI for the view rate ATE:", round(cp_ATE_ps_v[1], 3),
      "+/-", round(qnorm(0.975) * cp_ATE_ps_v[2], 3))
## [1] "95% CI for the view rate ATE: -0.316 +/- 0.129"
cp_ATE_ps_s = average_treatment_effect(cf_cp_s)
paste("95% CI for the saving rate ATE:", round(cp_ATE_ps_s[1], 3),
      "+/-", round(qnorm(0.975) * cp_ATE_ps_s[2], 3))
## [1] "95% CI for the saving rate ATE: -0.037 +/- 0.055"
cp_ATE_ps_v
##    estimate     std.err 
## -0.31636565  0.06596747
cp_ATE_ps_s
##    estimate     std.err 
## -0.03749626  0.02823887

for the conditional df:

# test-train split
cp_cond_split <- initial_split(cp_cond_matched, prop = 0.5)
cp_cond_train <- training(cp_cond_split) #1235
cp_cond_test <- testing(cp_cond_split) #1236

X.cp.cond.train = cp_cond_train[, c(5:31)]
X.cp.cond.test = cp_cond_test[, c(5:31)]

Y.cp.cond.train.s = cp_cond_train$saving_response

W.cp.cond.train = cp_cond_train$treat

# cf estimation 
cf_cp_cond_s <- causal_forest(
  X.cp.cond.train,
  Y.cp.cond.train.s,
  W.cp.cond.train,
  tune.parameters = "all",
  seed = myseed
  )

# out-of-bag predictions
tau_hat_cp_cond_s = predict(cf_cp_cond_s, X.cp.cond.test)$predictions

# plot tau.hat
plot(density(tau_hat_cp_cond_s),
     main = "Predicted conditional saving rate CATEs: control vs. percentage")

# save rate ATE conditional on viewing
cp_cond_ATE_ps_s = average_treatment_effect(cf_cp_cond_s)
paste("95% CI for the saving rate ATE:", round(cp_cond_ATE_ps_s[1], 3),
      "+/-", round(qnorm(0.975) * cp_cond_ATE_ps_s[2], 3))
## [1] "95% CI for the saving rate ATE: -0.098 +/- 0.042"
cp_cond_ATE_ps_s 
##    estimate     std.err 
## -0.09837554  0.02165855
  • absolute vs percentage
# test-train split
ap_split <- initial_split(ap_matched, prop = 0.5)
ap_train <- training(ap_split) #1979
ap_test <- testing(ap_split) #1980

X.ap.train = ap_train[, c(5:31)] 
X.ap.test = ap_test[, c(5:31)] 

Y.ap.train.v = ap_train$viewed
Y.ap.train.s = ap_train$saving_response

W.ap.train = ap_train$treat 

# cf estimation 
cf_ap_v <- causal_forest(
  X.ap.train,
  Y.ap.train.v,
  W.ap.train,
  tune.parameters = "all",
  seed = myseed
  )

cf_ap_s <- causal_forest(
  X.ap.train,
  Y.ap.train.s,
  W.ap.train,
  tune.parameters = "all",
  seed = myseed
  )

# out-of-bag predictions
tau_hat_ap_v = predict(cf_ap_v, X.ap.test)$predictions

tau_hat_ap_s = predict(cf_ap_s, X.ap.test)$predictions

# plot tau.hat
plot(density(tau_hat_ap_v),
     main = "Predicted view rate CATEs: absolute vs. percentage")

plot(density(tau_hat_ap_s),
     main = "Predicted saving rate CATEs: absolute vs. percentage")

# ATE
ap_ATE_ps_v = average_treatment_effect(cf_ap_v)
paste("95% CI for the view rate ATE:", round(ap_ATE_ps_v[1], 3),
      "+/-", round(qnorm(0.975) * ap_ATE_ps_v[2], 3))
## [1] "95% CI for the view rate ATE: NaN +/- NaN"
ap_ATE_ps_s = average_treatment_effect(cf_ap_s)
paste("95% CI for the saving rate ATE:", round(ap_ATE_ps_s[1], 3),
      "+/-", round(qnorm(0.975) * ap_ATE_ps_s[2], 3))
## [1] "95% CI for the saving rate ATE: NaN +/- NaN"
ap_ATE_ps_v
## estimate  std.err 
##      NaN      NaN
ap_ATE_ps_s
## estimate  std.err 
##      NaN      NaN

for the conditional df:

# test-train split
ap_cond_split <- initial_split(ap_cond_matched, prop = 0.5)
ap_cond_train <- training(ap_cond_split) #485
ap_cond_test <- testing(ap_cond_split) #486

X.ap.cond.train = ap_cond_train[, c(5:31)]
X.ap.cond.test = ap_cond_test[, c(5:31)]

Y.ap.cond.train.s = ap_cond_train$saving_response

W.ap.cond.train = ap_cond_train$treat

# cf estimation
cf_ap_cond_s <- causal_forest(
  X.ap.cond.train,
  Y.ap.cond.train.s,
  W.ap.cond.train,
  tune.parameters = "all",
  seed = myseed
  )

# out-of-bag predictions
tau_hat_ap_cond_s = predict(cf_ap_cond_s, X.ap.cond.test)$predictions

# plot tau.hat
plot(density(tau_hat_ap_cond_s),
     main = "Predicted conditional saving rate CATEs: absolute vs. percentage")

# save rate ATE conditional on viewing
ap_cond_ATE_ps_s = average_treatment_effect(cf_ap_cond_s)
paste("95% CI for the conditional saving rate ATE:", round(ap_cond_ATE_ps_s[1], 3),
      "+/-", round(qnorm(0.975) * ap_cond_ATE_ps_s[2], 3))
## [1] "95% CI for the conditional saving rate ATE: -0.136 +/- 0.112"
ap_cond_ATE_ps_s 
##    estimate     std.err 
## -0.13601824  0.05738161

CATE: focus only on control versus percentage

  • heterogeneity test
# BLP test for heterogeneity 


test_calibration(cf_cp_v, vcov.type = "HC3")
## 
## Best linear fit using forest predictions (on held-out data)
## as well as the mean forest prediction as regressors, along
## with one-sided heteroskedasticity-robust (HC3) SEs:
## 
##                                Estimate Std. Error t value
## mean.forest.prediction         1.202439   0.140213  8.5758
## differential.forest.prediction 1.339288   0.076061 17.6081
##                                               Pr(>t)    
## mean.forest.prediction         < 0.00000000000000022 ***
## differential.forest.prediction < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_calibration(cf_cp_s, vcov.type = "HC3")
## 
## Best linear fit using forest predictions (on held-out data)
## as well as the mean forest prediction as regressors, along
## with one-sided heteroskedasticity-robust (HC3) SEs:
## 
##                                Estimate Std. Error t value
## mean.forest.prediction          1.00424    0.10948  9.1730
## differential.forest.prediction  2.02009    0.26378  7.6582
##                                               Pr(>t)    
## mean.forest.prediction         < 0.00000000000000022 ***
## differential.forest.prediction   0.00000000000001141 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_calibration(cf_cp_cond_s, vcov.type = "HC3")
## 
## Best linear fit using forest predictions (on held-out data)
## as well as the mean forest prediction as regressors, along
## with one-sided heteroskedasticity-robust (HC3) SEs:
## 
##                                Estimate Std. Error t value    Pr(>t)    
## mean.forest.prediction          1.05108    0.31592  3.3271 0.0004518 ***
## differential.forest.prediction  1.33736    0.31251  4.2794 0.0000101 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • variable importance
# viewed
vi.v = t(variable_importance(cf_cp_v))
colnames(vi.v) = names(X.cp.train)
barplot(vi.v, horiz = T, las = 2, cex.axis = .5, cex.names = .5,
        main = "Variable Importance for View Rate")

# saving_response  
vi.s = t(variable_importance(cf_cp_s))
colnames(vi.s) = names(X.cp.train)
barplot(vi.s, horiz = T, las = 2, cex.axis = .5, cex.names = .5,
        main = "Variable Importance for Saving Rate")

# conditional saving_response 
vi.cond.s = t(variable_importance(cf_cp_cond_s))
colnames(vi.cond.s) = names(X.cp.cond.train)
barplot(vi.cond.s, horiz = T, las = 2, cex.axis = .5, cex.names = .5,
        main = "Variable Importance for Conditional Saving Rate")

# pick important vars (vi > median)

impvar.v = colnames(vi.v)[vi.v>median(vi.v)]
print(impvar.v) 
##  [1] "goal_length"                   "week_target"                  
##  [3] "starting_capital"              "age"                          
##  [5] "tenure"                        "category_FINANCIAL_FREEDOM"   
##  [7] "category_HOME_LIVING"          "category_LIFE_EVENTS_PRESENTS"
##  [9] "category_SPORTS_EQUIPMENT"     "category_TRAVEL_HOLIDAY"      
## [11] "category_UNKNOWN"              "saving_mode_AUTOPILOT"        
## [13] "saving_mode_MANUAL"
impvar.s = colnames(vi.s)[vi.s>median(vi.s)]
print(impvar.s)
##  [1] "goal_length"                    "week_target"                   
##  [3] "starting_capital"               "age"                           
##  [5] "tenure"                         "category_FINANCIAL_FREEDOM"    
##  [7] "category_LEISURE_ENTERTAINMENT" "category_MOBILITY"             
##  [9] "category_SPORTS_EQUIPMENT"      "saving_mode_AUTOPILOT"         
## [11] "saving_mode_MANUAL"             "gender_FEMALE"                 
## [13] "gender_MALE"
impvar.cond.s = colnames(vi.cond.s)[vi.s>median(vi.cond.s)]
print(impvar.cond.s)
##  [1] "goal_length"                    "week_target"                   
##  [3] "starting_capital"               "age"                           
##  [5] "tenure"                         "category_FINANCIAL_FREEDOM"    
##  [7] "category_HOME_ENTERTAINMENT"    "category_LEISURE_ENTERTAINMENT"
##  [9] "category_MOBILITY"              "category_SPORTS_EQUIPMENT"     
## [11] "category_TRAVEL_HOLIDAY"        "saving_mode_AUTOPILOT"         
## [13] "saving_mode_MANUAL"             "gender_FEMALE"                 
## [15] "gender_MALE"                    "country_AT"
# evaluating heterogeneity with important var
## viewed
best_linear_projection(cf_cp_v,
                       cp_train[, impvar.v],
                       vcov.type = "HC3")
## 
## Best linear projection of the conditional average treatment effect.
## Confidence intervals are cluster- and heteroskedasticity-robust (HC3):
## 
##                                    Estimate    Std. Error t value Pr(>|t|)  
## (Intercept)                   -0.3949048924  0.2945303330 -1.3408  0.18005  
## goal_length                    0.0154192143  0.0180787332  0.8529  0.39376  
## week_target                   -0.0000149135  0.0000136408 -1.0933  0.27432  
## starting_capital               0.0000023195  0.0000050749  0.4571  0.64765  
## age                            0.0049888455  0.0055451628  0.8997  0.36834  
## tenure                         0.0042192594  0.1201494592  0.0351  0.97199  
## category_FINANCIAL_FREEDOM    -0.6441629371  0.3428497382 -1.8788  0.06033 .
## category_HOME_LIVING           0.4363051241  0.2050044017  2.1283  0.03337 *
## category_LIFE_EVENTS_PRESENTS -0.3002309699  0.3597146834 -0.8346  0.40397  
## category_SPORTS_EQUIPMENT      0.3273064313  0.4781860602  0.6845  0.49371  
## category_TRAVEL_HOLIDAY       -0.0960605632  0.1680326108 -0.5717  0.56757  
## category_UNKNOWN               0.0292219999  0.1791152537  0.1631  0.87041  
## saving_mode_AUTOPILOT         -0.0940320474  0.1380621669 -0.6811  0.49585  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## saving_response
best_linear_projection(cf_cp_s,
                       cp_train[, impvar.s],
                       vcov.type = "HC3")
## 
## Best linear projection of the conditional average treatment effect.
## Confidence intervals are cluster- and heteroskedasticity-robust (HC3):
## 
##                                     Estimate    Std. Error t value Pr(>|t|)
## (Intercept)                     0.1894794864  0.1368842825  1.3842   0.1664
## goal_length                    -0.0022500162  0.0036283735 -0.6201   0.5352
## week_target                    -0.0000193091  0.0000120154 -1.6070   0.1081
## starting_capital                0.0000011541  0.0000012699  0.9088   0.3635
## age                            -0.0041300447  0.0026393695 -1.5648   0.1177
## tenure                         -0.0057171328  0.0576847953 -0.0991   0.9211
## category_FINANCIAL_FREEDOM     -0.1768670494  0.1621538602 -1.0907   0.2754
## category_LEISURE_ENTERTAINMENT  0.0891049173  0.3104154032  0.2871   0.7741
## category_MOBILITY              -0.0888940653  0.1013703991 -0.8769   0.3806
## category_SPORTS_EQUIPMENT      -0.1392654732  0.2334292115 -0.5966   0.5508
## saving_mode_AUTOPILOT          -0.0129323100  0.0642830141 -0.2012   0.8406
## gender_FEMALE                   0.0349714714  0.0658781654  0.5309   0.5955
## conditional saving_response
best_linear_projection(cf_cp_cond_s,
                       cp_cond_train[, impvar.cond.s],
                       vcov.type = "HC3")
## 
## Best linear projection of the conditional average treatment effect.
## Confidence intervals are cluster- and heteroskedasticity-robust (HC3):
## 
##                                      Estimate     Std. Error t value Pr(>|t|)  
## (Intercept)                     0.15420761843  0.11427044619  1.3495  0.17743  
## goal_length                    -0.00150482275  0.00349037642 -0.4311  0.66645  
## week_target                    -0.00000652300  0.00000465649 -1.4008  0.16152  
## starting_capital                0.00000011021  0.00000137807  0.0800  0.93627  
## age                            -0.00566523773  0.00234571654 -2.4151  0.01588 *
## tenure                          0.03593254710  0.08368361753  0.4294  0.66772  
## category_FINANCIAL_FREEDOM     -0.08731065694  0.09255146505 -0.9434  0.34568  
## category_HOME_ENTERTAINMENT     0.13722874207  0.14424629566  0.9514  0.34162  
## category_LEISURE_ENTERTAINMENT  0.34774970064  0.18580622215  1.8716  0.06151 .
## category_MOBILITY              -0.04600418635  0.09162355395 -0.5021  0.61569  
## category_SPORTS_EQUIPMENT       0.04338853191  0.07229981544  0.6001  0.54854  
## category_TRAVEL_HOLIDAY        -0.03840158337  0.05090036656 -0.7544  0.45073  
## saving_mode_AUTOPILOT          -0.00137652559  0.04624970190 -0.0298  0.97626  
## gender_FEMALE                  -0.00026143372  0.04567575247 -0.0057  0.99543  
## country_AT                     -0.08564785026  0.05424386784 -1.5789  0.11461  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • Partial dependence plots
# plot CATE 
## view rate

par(mfrow=c(3,4))
for (k in 1:27) {
  X.cp.partial = matrix(0, 101, ncol(X.cp))
  
  X.cp.partial[, k] = seq(min(X.cp[, k]), max(X.cp[, k]), length.out = 101)
  
  tau_hat_cp_test_v = predict(cf_cp_v, X.cp.partial, estimate.variance = TRUE) 
  
  sigma_hat_v = sqrt(tau_hat_cp_test_v$variance.estimates)
  
  plot(X.cp.partial[, k], tau_hat_cp_test_v$predictions,
       ylim = range(tau_hat_cp_test_v$predictions + sigma_hat_v, tau_hat_cp_test_v$predictions - sigma_hat_v, 0, 0.01),
       
       xlab = colnames(X.cp)[k], ylab = "CATE", type = "l") 
  
  lines(X.cp.partial[, k], tau_hat_cp_test_v$predictions + sigma_hat_v,
        col = 1, lty = 2)
  
  lines(X.cp.partial[, k], tau_hat_cp_test_v$predictions - sigma_hat_v,
        col = 1, lty = 2) 
  
  abline(h = 0, col = 2)}

## saving rate

par(mfrow=c(3,4))
for (k in 1:27) {
  X.cp.partial = matrix(0, 101, ncol(X.cp))
  
  X.cp.partial[, k] = seq(min(X.cp[, k]), max(X.cp[, k]), length.out = 101)
  
  tau_hat_cp_test_s = predict(cf_cp_s, X.cp.partial, estimate.variance = TRUE) 
  
  sigma_hat_s = sqrt(tau_hat_cp_test_s$variance.estimates)
  
  plot(X.cp.partial[, k], tau_hat_cp_test_s$predictions,
       ylim = range(tau_hat_cp_test_s$predictions + sigma_hat_s, tau_hat_cp_test_s$predictions - sigma_hat_s, 0, 0.01),
       
       xlab = colnames(X.cp)[k], ylab = "CATE", type = "l") 
  
  lines(X.cp.partial[, k], tau_hat_cp_test_s$predictions + sigma_hat_s,
        col = 1, lty = 2)
  
  lines(X.cp.partial[, k], tau_hat_cp_test_s$predictions - sigma_hat_s,
        col = 1, lty = 2) 
  
  abline(h = 0, col = 2)}

## conditional saving rate 

par(mfrow=c(3,4))
for (k in 1:27) {
  X.cp.cond.partial = matrix(0, 101, ncol(X.cp.cond))
  
  X.cp.cond.partial[, k] = seq(min(X.cp.cond[, k]), max(X.cp.cond[, k]), length.out = 101)
  
  tau_hat_cp_cond_test_s = predict(cf_cp_cond_s, X.cp.cond.partial, estimate.variance = TRUE) 
  
  sigma_hat_cond_s = sqrt(tau_hat_cp_cond_test_s$variance.estimates)
  
  plot(X.cp.cond.partial[, k], tau_hat_cp_cond_test_s$predictions,
       ylim = range(tau_hat_cp_cond_test_s$predictions + sigma_hat_cond_s, tau_hat_cp_cond_test_s$predictions -
                      sigma_hat_cond_s, 0, 0.01),
       
       xlab = colnames(X.cp.cond)[k], ylab = "CATE", type = "l") 
  
  lines(X.cp.cond.partial[, k], tau_hat_cp_cond_test_s$predictions + sigma_hat_cond_s,
        col = 1, lty = 2)
  
  lines(X.cp.cond.partial[, k], tau_hat_cp_cond_test_s$predictions - sigma_hat_cond_s,
        col = 1, lty = 2) 
  
  abline(h = 0, col = 2)}

  • off policy evaluation
# view rate 
Y.cp.test.v = cp_test$viewed
W.cp.test = cp_test$treat

seg_cf_s1_v = tau_hat_cp_v < 0 # control
seg_cf_s2_v = tau_hat_cp_v >= 0 # percentage

optimal_allocation_v = c(sum(seg_cf_s1_v), 
                         sum(seg_cf_s2_v)) / length(Y.cp.test.v)

print(optimal_allocation_v)
## [1] 0.7438498 0.2561502
random_allocation_v = mean(Y.cp.test.v) 

statusquo_allocation_v = mean(Y.cp.test.v[W.cp.test == 0]) 

undifferentiated_allocation_v = mean(Y.cp.test.v[W.cp.test == 1]) 

optimal_allocation_v = mean(c(Y.cp.test.v[seg_cf_s1_v & W.cp.test == 0],
                              Y.cp.test.v[seg_cf_s2_v & W.cp.test == 1]))

offline_evaluation_v = round(rbind(
  random_allocation_v,
  statusquo_allocation_v,
  undifferentiated_allocation_v,
  optimal_allocation_v), round(3))
colnames(offline_evaluation_v) = "View rate"
rownames(offline_evaluation_v) = c("Status Quo", 
                                 "Random", 
                                 "Undifferentiated",
                                 "Optimized with CF")
print(offline_evaluation_v)
##                   View rate
## Status Quo            0.261
## Random                0.276
## Undifferentiated      0.238
## Optimized with CF     0.359
# saving rate 
Y.cp.test.s = cp_test$saving_response
W.cp.test = cp_test$treat

seg_cf_s1_s = tau_hat_cp_s < 0 # control
seg_cf_s2_s = tau_hat_cp_s >= 0 # percentage

optimal_allocation_s = c(sum(seg_cf_s1_s), 
                         sum(seg_cf_s2_s)) / length(Y.cp.test.s)

print(optimal_allocation_s)
## [1] 0.98683643 0.01316357
random_allocation_s = mean(Y.cp.test.s) 

statusquo_allocation_s = mean(Y.cp.test.s[W.cp.test == 0]) 

undifferentiated_allocation_s = mean(Y.cp.test.s[W.cp.test == 1]) 

optimal_allocation_s = mean(c(Y.cp.test.s[seg_cf_s1_s & W.cp.test == 0],
                              Y.cp.test.s[seg_cf_s2_s & W.cp.test == 1]))

offline_evaluation_s = round(rbind(
  random_allocation_s,
  statusquo_allocation_s,
  undifferentiated_allocation_s,
  optimal_allocation_s), round(3))
colnames(offline_evaluation_s) = "Saving rate"
rownames(offline_evaluation_s) = c("Status Quo", 
                                 "Random", 
                                 "Undifferentiated",
                                 "Optimized with CF")
print(offline_evaluation_s)
##                   Saving rate
## Status Quo              0.047
## Random                  0.061
## Undifferentiated        0.025
## Optimized with CF       0.062
# conditional saving rate
Y.cp.cond.test.s = cp_cond_test$saving_response
W.cp.cond.test = cp_cond_test$treat

seg_cf_s1_cond_s = tau_hat_cp_cond_s < 0 # control
seg_cf_s2_cond_s = tau_hat_cp_cond_s >= 0 # percentage

optimal_allocation_cond_s = c(sum(seg_cf_s1_cond_s), 
                         sum(seg_cf_s2_cond_s)) / length(Y.cp.cond.test.s)

print(optimal_allocation_cond_s)
## [1] 0.8527508 0.1472492
random_allocation_cond_s = mean(Y.cp.cond.test.s) 

statusquo_allocation_cond_s = mean(Y.cp.cond.test.s[W.cp.cond.test == 0]) 

undifferentiated_allocation_cond_s = mean(Y.cp.cond.test.s[W.cp.cond.test == 1]) 

optimal_allocation_cond_s = mean(c(Y.cp.cond.test.s[seg_cf_s1_cond_s & W.cp.cond.test == 0],
                                   Y.cp.cond.test.s[seg_cf_s2_cond_s & W.cp.cond.test == 1]))

offline_evaluation_cond_s = round(rbind(
  random_allocation_cond_s,
  statusquo_allocation_cond_s,
  undifferentiated_allocation_cond_s,
  optimal_allocation_cond_s), round(3))
colnames(offline_evaluation_cond_s) = "Conditional saving rate"
rownames(offline_evaluation_cond_s) = c("Status Quo", 
                                 "Random", 
                                 "Undifferentiated",
                                 "Optimized with CF")
print(offline_evaluation_cond_s)
##                   Conditional saving rate
## Status Quo                          0.192
## Random                              0.243
## Undifferentiated                    0.091
## Optimized with CF                   0.248